require(quanteda)
require(stm)
require(mvtnorm)
require(dichromat)
Paired <- colorschemes$Categorical.12

#### covert the data to stm format ####
load("dfm_matrix.Rdata")
dim(dfm.matrix)  # number of candidates and terms
table(docvars(dfm.matrix, "female"))  # number of women candidates

stm.data <- convert(dfm.matrix, to = "stm")
stm.data$meta$year <- factor(stm.data$meta$year)
stm.data$meta$party <- factor(stm.data$meta$party,
                              levels = c("others", "LDP", "JSP", "Komeito", "DSP", "JCP", "NFP", "DPJ", "left", "right"))
stm.data$meta$party.year <- paste(stm.data$meta$party, 
                                  stm.data$meta$year, sep = ".")

#### specify the number of topics (Online Appendix A.2) ####
## from 30- to 130-topic models
number.of.topics.model1.1 <- seq(30, 130, 10)

for (i in 1:length(number.of.topics.model1.1)) {
  set.seed(number.of.topics.model1.1[i])
  STM.result.model1 <- stm(documents = stm.data$documents, 
                           vocab = stm.data$vocab, 
                           K = number.of.topics.model1.1[i], 
                           prevalence = ~ female + year + party + age + I(age ^ 2) + incumbent + wins, 
                           data = stm.data$meta)
  save(STM.result.model1, file = paste0("STM_result_model1_", number.of.topics.model1.1[i], ".Rdata"))
}

exclusivity.res.model1.1 <- SC.res.model1.1 <- rep(NA, length(number.of.topics.model1.1))
for (i in 1:length(number.of.topics.model1.1)) {
  load(paste0("STM_result_model1_", number.of.topics.model1.1[i], ".Rdata"))
  exclusivity.res.model1.1[i] <- mean(exclusivity(STM.result.model1))
  SC.res.model1.1[i] <- mean(semanticCoherence(STM.result.model1, stm.data$documents))
}

# Figure A.1
cairo_pdf("Figure_A1.pdf", 
          width = 5.6, height = 3, pointsize = 8, family = "Helvetica")
layout(matrix(1:2, 1, 2))
par(mar = c(4, 4, 3, 2), lwd = 0.5)
plot(c(1:length(number.of.topics.model1.1)), SC.res.model1.1, type = "b", 
     main = "Semantic Coherence", xlab = "Number of Topics", ylab = "Average Semantic Coherence", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.model1.1), 2), 
     labels = number.of.topics.model1.1[seq(1, length(number.of.topics.model1.1), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
plot(c(1:length(number.of.topics.model1.1)), exclusivity.res.model1.1, type = "b", 
     main = "Exclusivity", xlab = "Number of Topics", ylab = "Average Exclusivity", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.model1.1), 2), 
     labels = number.of.topics.model1.1[seq(1, length(number.of.topics.model1.1), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
dev.off()

## from 70- to 90-topic models
number.of.topics.model1.2 <- c(71:79, 81:89)

for (i in 1:length(number.of.topics.model1.2)) {
  set.seed(number.of.topics.model1.2[i])
  STM.result.model1 <- stm(documents = stm.data$documents, 
                           vocab = stm.data$vocab, 
                           K = number.of.topics.model1.2[i], 
                           prevalence = ~ female + year + party + age + I(age ^ 2) + incumbent + wins, 
                           data = stm.data$meta)
  save(STM.result.model1, file = paste0("STM_result_model1_", number.of.topics.model1.2[i], ".Rdata"))
}

number.of.topics.model1.3 <- 70:90

exclusivity.res.model1.2 <- SC.res.model1.2 <- rep(NA, length(number.of.topics.model1.3))
for (i in 1:length(number.of.topics.model1.3)) {
  load(paste0("STM_result_model1_", number.of.topics.model1.3[i], ".Rdata"))
  exclusivity.res.model1.2[i] <- mean(exclusivity(STM.result.model1))
  SC.res.model1.2[i] <- mean(semanticCoherence(STM.result.model1, stm.data$documents))
}

# Figure A.2
cairo_pdf("Figure_A2.pdf", 
          width = 5.6, height = 3, pointsize = 8, family = "Helvetica")
layout(matrix(1:2, 1, 2))
par(mar = c(4, 4, 3, 2), lwd = 0.5)
plot(c(1:length(number.of.topics.model1.3)), SC.res.model1.2, type = "b", 
     main = "Semantic Coherence", xlab = "Number of Topics", ylab = "Average Semantic Coherence", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.model1.3), 2), 
     labels = number.of.topics.model1.3[seq(1, length(number.of.topics.model1.3), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
plot(c(1:length(number.of.topics.model1.3)), exclusivity.res.model1.2, type = "b", 
     main = "Exclusivity", xlab = "Number of Topics", ylab = "Average Exclusivity", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.model1.3), 2), 
     labels = number.of.topics.model1.3[seq(1, length(number.of.topics.model1.3), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
dev.off()

#### list of topics (Online Appendix B.1) ####
load("STM_result_model1_75.Rdata")

topic.order <- order(colMeans(STM.result.model1$theta), decreasing = TRUE)
data.frame(labelTopics(STM.result.model1, n = 5)$prob, 
           prob = round(colMeans(STM.result.model1$theta), 3) * 100)[topic.order, ]

topic.labels <- c("Political activity", "Ethical claims", "DPJ manifesto (2009)", 
                  "Criticism of the government (1993)", "Appeals of their sincerity", 
                  "Criticism of the government (2005)", "Postal privatization", "Energy policy", 
                  "Privatization of the JH", "Abolition of the consumption tax", "JCP manifesto (1996)", 
                  "Criticism of the government (2003)", "Appeals for youthfulness", 
                  "Criticism of the government (2009)", "Taxation in general", 
                  "Political reform", "Nursing", "Regime change", "Environmental issues", 
                  "NFP manifesto", "Criticism of the government (1990)", "JCP manifesto (2000)", 
                  "Criticism of the government (1986)", "Railways", "World affairs", 
                  "Decentralization", "Diplomacy in Asia", "Political ethics", "DPJ manifesto (2005)", 
                  "Reform within the LDP", "Elderly care", "Candidate information", "Future of Japan", 
                  "Community development", "Local development", "U.S. military bases", 
                  "Legislative records", "Abstract principles", "Health inequality", 
                  "JCP manifesto (2005)", "Appeals for change", "Policy support for mothers", 
                  "National problems in general", "Increasing domestic demand", 
                  "Regulation of political funds", "Agricultural administration", "Gender equality", 
                  "Urban development", "Science and technology", "Small and medium-sized enterprises", 
                  "Administration reform", "Economic measures", "Infrastructure", "Local tourism", 
                  "Appeals for novelty", "Coal mining", "Administrative efficiency", "Child care", 
                  "Appeals of their efforts", "Criticism of the government (2000)", 
                  "Freedom of information", "Agriculture and fisheries", "Fiscal reconstruction", 
                  "Criticism of the government (1996)", "Appeals for problem-solving ability", 
                  "Traffic network development", "Disaster prevention", "Title", "Pacifism", 
                  "Public pension", "Appeals for hearing voters' demands", 
                  "Policies of the Koizumi government", "Chugoku and Shikoku", 
                  "Pensions in general", "Aged society")
colnames(STM.result.model1$theta) <- topic.labels

#### post-estimation simulation ####
## conduct the post-estimation simulation
set.seed(12345)
post.sim.model1 <- estimateEffect(1:75 ~ female + year + party + age + I(age ^ 2) + incumbent + wins, 
                                  STM.result.model1, stm.data$meta, nsim = 100)
summary.post.sim.model1 <- summary(post.sim.model1)

## which topics show a large gender difference
gender.diff.model1 <- matrix(NA, 75, 4)
for (i in 1:75) {
  gender.diff.model1[i, ] <- summary.post.sim.model1$tables[[i]][2, ]  # the second row corresponds to the coefficient of the female dummy
}
rownames(gender.diff.model1) <- topic.labels
significant.topics <- p.adjust(gender.diff.model1[, 4], "holm") < 0.05
round(gender.diff.model1[significant.topics, ], 3)  # topics whose gender difference is significant with the Holm correction
gender.diff.model1.order <- order(gender.diff.model1[, 1], decreasing = TRUE)

significant.topic.id <- which(significant.topics)
significant.topic.order <- order(gender.diff.model1[significant.topics, 1], 
                                 decreasing = TRUE)
distinctive.topic.id <- significant.topic.id[significant.topic.order]

## top 10 most frequent words within the five distinctive topics (Table 1)
labelTopics(STM.result.model1, n = 10)$prob[distinctive.topic.id, ]

## manifestos most closely related to the five distinctive topics (Online Appendix B.2)
for (i in 1:5) {
  print(topic.labels[distinctive.topic.id[i]])
  print(stm.data$meta[which.max(STM.result.model1$theta[, distinctive.topic.id[i]]), ])
  print(round(head(sort(STM.result.model1$theta[which.max(STM.result.model1$theta[, distinctive.topic.id[i]]), ], 
                        decreasing = TRUE), 3), 3))
}

#### compute the first-differences (FDs) (main text and Online Appendix C) ####
## function implementing the procedure explained in Online Appendix A.3
predict.fun.model1 <- function(result, covname, cov.value1, cov.value2, sim.beta) {
  cdata <- result$data
  cdata[, covname] <- cov.value1
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  sim.result.1 <- tcrossprod(design.matrix, sim.beta)
  cdata[, covname] <- cov.value2
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  sim.result.2 <- tcrossprod(design.matrix, sim.beta)
  dif.sim.result <- sim.result.1 - sim.result.2
  dif.sim.mean <- mean(dif.sim.result)
  dif.sim.CI <- quantile(colMeans(dif.sim.result), c(0.025, 0.975))
  c(dif.sim.mean,  # point estimate of FD
    dif.sim.CI,  # confidence interval
    mean(sim.result.1),  # expected prevalence when covname takes cov.value1
    mean(sim.result.2))  # expected prevalence when covname takes cov.value2
}

## compute the FDs for gender, JSP (vs. LDP), and DPJ (vs. LDP)
female.sim.result.model1 <- JSP.sim.result.model1 <- DPJ.sim.result.model1 <- matrix(NA, 75, 5)
rownames(female.sim.result.model1) <- rownames(JSP.sim.result.model1) <- 
  rownames(DPJ.sim.result.model1) <- topic.labels
for (i in 1:75) {
  set.seed(12345)
  sim.beta <- do.call(rbind, 
                      lapply(post.sim.model1$parameters[[i]], 
                             function(x) rmvnorm(100, x$est, x$vcov)))
  female.sim.result.model1[i, ] <- predict.fun.model1(post.sim.model1, "female", 1, 0, sim.beta)
  JSP.sim.result.model1[i, ] <- predict.fun.model1(post.sim.model1, "party", "JSP", "LDP", sim.beta)
  DPJ.sim.result.model1[i, ] <- predict.fun.model1(post.sim.model1, "party", "DPJ", "LDP", sim.beta)
}

## how large the significant differences are
round(female.sim.result.model1[significant.topics, ], 3)
round(female.sim.result.model1[significant.topics, 4] / 
        female.sim.result.model1[significant.topics, 5], 2)  # women-men ratios

# Figure 1
cairo_pdf("Figure_1.pdf", 
          width = 6, height = 2, pointsize = 8, family = "Helvetica")
par(mar = c(4.5, 1, 1, 1), lwd = 0.5)
plot(NULL, NULL, type = "n", bty = "n", 
     xlim = c(-0.025, 0.022), ylim = c(0.5, 5.5), 
     xlab = "", ylab = "", xaxt = "n", yaxt = "n")
abline(v = seq(-0.01, 0.02, 0.01), col = "gray", lty = c(3, 1, 3, 3, 3))
segments(rep(-0.012, 5), 1:5, 
         rep(0.022, 5), 1:5, lty = 3, col = "gray")
points(female.sim.result.model1[distinctive.topic.id, 1], 
       5:1, pch = 19)
segments(female.sim.result.model1[distinctive.topic.id, 2], 5:1, 
         female.sim.result.model1[distinctive.topic.id, 3], 5:1)
text(rep(-0.012, 5), 5:1, 
     labels = topic.labels[distinctive.topic.id], pos = 2)
axis(1, at = c(seq(-0.01, 0.02, 0.01)), lwd = 0.5)
mtext(c("more prevalent\namong men", "more prevalent\namong women"), 
      side = 1, at = c(-0.01, 0.02), line = 3)
dev.off()

# Figure A.4
cairo_pdf("Figure_A4.pdf", 
          width = 6, height = 7, pointsize = 7, family = "Helvetica")
par(mar = c(4, 1, 1, 1), lwd = 0.5)
plot(NULL, NULL, type = "n", bty = "n", 
     xlim = c(-0.035, 0.022), ylim = c(1, 75), 
     xlab = "", ylab = "", xaxt = "n", yaxt = "n")
abline(v = seq(-0.02, 0.02, 0.01), col = "gray")
segments(rep(-0.022, 75), 1:75, 
         rep(0.022, 75), 1:75, lty = 3, col = "gray")
points(female.sim.result.model1[gender.diff.model1.order, 1], 75:1, 
       pch = ifelse(significant.topics[gender.diff.model1.order], 15, 19), 
       col = ifelse(significant.topics[gender.diff.model1.order], Paired[12], "black"))
segments(female.sim.result.model1[gender.diff.model1.order, 2], 75:1, 
         female.sim.result.model1[gender.diff.model1.order, 3], 75:1, 
         col = ifelse(significant.topics[gender.diff.model1.order], Paired[12], "black"))
text(rep(-0.022, 75), 75:1, 
     labels = topic.labels[gender.diff.model1.order], pos = 2, cex = 0.8)
axis(1, at = c(seq(-0.02, 0.02, 0.01)), lwd = 0.5)
mtext(c("more prevalent\namong men", "more prevalent\namong women"), 
      side = 1, at = c(-0.02, 0.02), line = 3)
dev.off()

## comparison with inter-party differences
JSP.results.order <- order(JSP.sim.result.model1[, 1], decreasing = TRUE)
JSP.sim.result.model1[JSP.results.order[1], 1]  # topic with the largest difference b/w LDP and JSP

DPJ.results.order <- order(DPJ.sim.result.model1[, 1], decreasing = TRUE)
DPJ.sim.result.model1[DPJ.results.order[1], 1]  # topic with the largest difference b/w LDP and DPJ

# difference b/w LDP and DPJ in the prevalence of major topics
round(DPJ.sim.result.model1["Elderly care", 1], 3)
round(DPJ.sim.result.model1["Public pension", 1], 3)
round(DPJ.sim.result.model1["Gender equality", 1], 3)

# Figure A.5
cairo_pdf("Figure_A5.pdf", 
          width = 6, height = 7, pointsize = 7, family = "Helvetica")
par(mar = c(4, 1, 1, 1), lwd = 0.5)
plot(NULL, NULL, type = "n", bty = "n", 
     xlim = c(-0.075, 0.082), ylim = c(1, 75), 
     xlab = "", ylab = "", xaxt = "n", yaxt = "n")
abline(v = seq(-0.04, 0.08, 0.02), col = "gray")
segments(rep(-0.042, 75), 1:75, 
         rep(0.082, 75), 1:75, lty = 3, col = "gray")
points(JSP.sim.result.model1[JSP.results.order, 1], 75:1, pch = 19)
segments(JSP.sim.result.model1[JSP.results.order, 2], 75:1, 
         JSP.sim.result.model1[JSP.results.order, 3], 75:1)
text(rep(-0.042, 75), 75:1, 
     labels = topic.labels[JSP.results.order], pos = 2, cex = 0.8)
axis(1, at = c(seq(-0.04, 0.08, 0.02)), lwd = 0.5)
mtext(c("more prevalent\namong LDP candidates", "more prevalent\namong JSP candidates"), 
      side = 1, at = c(-0.035, 0.075), line = 3)
dev.off()

# Figure A.6
cairo_pdf("Figure_A6.pdf", 
          width = 6, height = 7, pointsize = 7, family = "Helvetica")
par(mar = c(4, 1, 1, 1), lwd = 0.5)
plot(NULL, NULL, type = "n", bty = "n", 
     xlim = c(-0.075, 0.082), ylim = c(1, 75), 
     xlab = "", ylab = "", xaxt = "n", yaxt = "n")
abline(v = seq(-0.04, 0.08, 0.02), col = "gray")
segments(rep(-0.042, 75), 1:75, 
         rep(0.082, 75), 1:75, lty = 3, col = "gray")
points(DPJ.sim.result.model1[DPJ.results.order, 1], 75:1, pch = 19)
segments(DPJ.sim.result.model1[DPJ.results.order, 2], 75:1, 
         DPJ.sim.result.model1[DPJ.results.order, 3], 75:1)
text(rep(-0.042, 75), 75:1, 
     labels = topic.labels[DPJ.results.order], pos = 2, cex = 0.8)
axis(1, at = c(seq(-0.04, 0.08, 0.02)), lwd = 0.5)
mtext(c("more prevalent\namong LDP candidates", "more prevalent\namong DPJ candidates"), 
      side = 1, at = c(-0.035, 0.075), line = 3)
dev.off()